package site.hanzhe.example.jsoup.zhihu;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.StrUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads one Zhihu answer page, strips it down to the answer's rich-text content and writes the
 * result to a timestamped HTML file.
 *
 * <p>Usage: {@code QuestionMain [url] [outputDir]}. Both arguments are optional; when omitted the
 * built-in defaults below are used.
 */
public class QuestionMain {

    /** Answer page fetched when no URL is passed on the command line. */
    private static final String DEFAULT_URL = "https://www.zhihu.com/question/317945181/answer/3472922456";

    /** Directory the HTML file is written to when none is passed on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "E:/Workspace/Private/zhihu/question/";

    /** User-Agent header sent with the page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";

    /**
     * Cookies sent with the page request, written as {@code name=value; name=value}.
     * Left empty, no cookies are sent.
     */
    private static final String RAW_COOKIES = "";

    /**
     * Matches one {@code name=value} pair. The name is any run of characters other than
     * {@code =}, {@code ;} and whitespace (so names containing {@code -} or {@code .} are kept whole);
     * the value runs up to the next {@code ;} and may itself contain {@code =}.
     */
    private static final Pattern COOKIE_PATTERN = Pattern.compile("([^=;\\s]+)=([^;]*)");

    /**
     * Entry point: fetch the page, rebuild its head and body, and write it to disk.
     *
     * @param args optional; {@code args[0]} overrides the page URL, {@code args[1]} overrides the
     *             output directory
     */
    public static void main(String[] args) {
        String url = args.length > 0 ? args[0] : DEFAULT_URL;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;

        // Fetch the page as a document object
        Document document = crawlDocument(url);
        // Rebuild the head
        processHead(document);
        // Rebuild the body
        processBody(document);

        // Write the result out as an HTML file named after the current time
        String file = toDirectoryPrefix(outputDir) + System.currentTimeMillis() + ".html";
        FileUtil.writeString(document.toString(), file, StandardCharsets.UTF_8);

        System.out.println("ok");
    }

    /**
     * Returns {@code dir} in a form that a file name can be appended to directly.
     *
     * @param dir directory path, with or without a trailing separator
     * @return {@code dir} unchanged if it is empty or already ends with {@code /} or {@code \},
     *         otherwise {@code dir} followed by {@code /}
     */
    private static String toDirectoryPrefix(String dir) {
        boolean terminated = dir.endsWith("/") || dir.endsWith("\\");
        return dir.isEmpty() || terminated ? dir : dir + "/";
    }

    /**
     * Performs a GET request for {@code url} and parses the response into a jsoup document.
     *
     * @param url address of the page to download
     * @return the parsed response
     * @throws RuntimeException wrapping the {@link IOException} if the request or parsing fails
     */
    private static Document crawlDocument(String url) {
        try {
            Connection connect = Jsoup.connect(url)
                    .method(Connection.Method.GET)
                    .header("user-agent", USER_AGENT)
                    .cookies(getCookies());
            return connect.execute().parse();
        } catch (IOException e) {
            // Keep the cause and say which URL failed
            throw new RuntimeException("Failed to fetch " + url, e);
        }
    }

    /**
     * Returns the cookies to send with the page request, parsed from {@link #RAW_COOKIES}.
     *
     * @return a new mutable map from cookie name to cookie value; empty when no cookies are configured
     */
    public static Map<String, String> getCookies() {
        return parseCookies(RAW_COOKIES);
    }

    /**
     * Splits a {@code name=value; name=value} string into a map.
     *
     * @param cookiesString the raw cookie string; fragments without {@code =} are ignored
     * @return a new mutable map of the pairs found; a later duplicate name overwrites an earlier one
     */
    private static Map<String, String> parseCookies(String cookiesString) {
        Map<String, String> cookieMap = MapUtil.newHashMap();
        Matcher matcher = COOKIE_PATTERN.matcher(cookiesString);
        while (matcher.find()) {
            // Trim so that "a=1 ; b=2" does not leave a trailing space on the value
            cookieMap.put(matcher.group(1), matcher.group(2).trim());
        }
        return cookieMap;
    }

    /**
     * Removes everything from {@code <head>} except the title and any element whose markup mentions
     * {@code font-family}, then appends a UTF-8 charset declaration and references to
     * {@code /static/index.js} and {@code /static/index.css}.
     *
     * @param document the document to modify in place
     */
    private static void processHead(Document document) {
        Element head = document.head();
        // The list includes head itself as well as its descendants, so head must be skipped
        // explicitly; otherwise a head without a title would be detached from the document.
        for (Element element : head.getAllElements()) {
            String tag = element.tagName();
            if ("head".equalsIgnoreCase(tag) || "title".equalsIgnoreCase(tag)) {
                continue;
            }
            // Keep elements that carry font styling or that wrap a title element
            String html = element.toString();
            if (html.contains("font-family") || html.contains("</title>")) {
                continue;
            }
            element.remove();
        }
        new Element("meta").attr("charset", "utf-8").appendTo(head);
        new Element("script").attr("src", "/static/index.js").appendTo(head);
        new Element("link").attr("href", "/static/index.css").attr("rel", "stylesheet").appendTo(head);
    }

    /**
     * Replaces the whole body with the first {@code RichText} element found inside the first element
     * that has both the {@code ContentItem} and {@code AnswerItem} classes.
     *
     * @param document the document to modify in place
     * @throws IllegalStateException if either element is missing from the page
     */
    private static void processBody(Document document) {
        Element body = document.body();
        // A compound class selector matches regardless of class order or additional classes
        Element answerItem = body.select(".ContentItem.AnswerItem").first();
        if (answerItem == null) {
            throw new IllegalStateException(
                    "No element with classes 'ContentItem AnswerItem' found in the page body");
        }
        Element richText = answerItem.getElementsByClass("RichText").first();
        if (richText == null) {
            throw new IllegalStateException(
                    "No 'RichText' element found inside the 'ContentItem AnswerItem' element");
        }
        body.empty();
        richText.appendTo(body);
    }

}
